{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Display settings: show up to 30 columns and place no cap on the number of rows.\n",
    "pd.options.display.max_columns = 30\n",
    "pd.options.display.max_rows = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List the dataset folder and print the first five lines of the metadata CSV.\n",
    "%ls /TimeseriesDatasets/forecasting/fred\n",
    "!head -n 5 /TimeseriesDatasets/forecasting/fred/ts_meta.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root folder of the raw dataset files and the folder that receives the\n",
    "# files written by this notebook.\n",
    "PATH = '/TimeseriesDatasets/forecasting/fred/'\n",
    "OUTPUT_PATH = PATH + 'preprocessed/'\n",
    "\n",
    "# exist_ok=True keeps the cell safe to re-run and avoids the\n",
    "# check-then-create race of a separate os.path.exists() test.\n",
    "os.makedirs(OUTPUT_PATH, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the info table, then compare its row count with the number of\n",
    "# distinct 'FREDid' values.\n",
    "df_info = pd.read_csv(f'{PATH}FREDInfo.csv')\n",
    "print(df_info.shape[0])\n",
    "print(df_info['FREDid'].nunique())\n",
    "df_info.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct series identifiers found in the info table.\n",
    "unique_ids = pd.unique(df_info['FREDid'])\n",
    "print(len(unique_ids))\n",
    "unique_ids[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct values of the 'SP' and 'Horizon' columns, shown as a pair.\n",
    "tuple(df_info[column].unique() for column in ('SP', 'Horizon'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the metadata table, then compare its row count with the number of\n",
    "# distinct 'id' values.\n",
    "df_meta = pd.read_csv(f'{PATH}ts_meta.csv')\n",
    "print(df_meta.shape[0])\n",
    "print(df_meta['id'].nunique())\n",
    "df_meta.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fraction of ids shared by both tables, first relative to the metadata\n",
    "# ids and then relative to the info ids.\n",
    "ids_meta = set(df_meta['id'].unique())\n",
    "ids_info = set(df_info['FREDid'].unique())\n",
    "common_elements = ids_meta.intersection(ids_info)\n",
    "for id_set in (ids_meta, ids_info):\n",
    "    print(len(common_elements) / len(id_set))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First metadata row whose 'id' equals the first unique id of the info table.\n",
    "first_id_mask = df_meta['id'] == unique_ids[0]\n",
    "result = df_meta.loc[first_id_mask].iloc[0]\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disabled: optional look at the top-level keys of 'raw_observations.h5'\n",
    "# (needs h5py). Uncomment the lines below to run it.\n",
    "# import h5py\n",
    "\n",
    "# with h5py.File(PATH+'raw_observations.h5', 'r') as file:\n",
    "#     print(file.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# NOTE: pickle.load can execute arbitrary code - only open trusted files.\n",
    "with open(f'{PATH}categories.pickle', 'rb') as pickle_file:\n",
    "    categories = pickle.load(pickle_file)\n",
    "len(categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the complete / train / test files of the dataset in one pass.\n",
    "data_complete, data_train, data_test = (\n",
    "    np.load(f'{PATH}fred-{split}.npz', allow_pickle=True)\n",
    "    for split in ('complete', 'train', 'test')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each loaded object print its label, its shape and the shape of its\n",
    "# first element.\n",
    "for label, archive in (\n",
    "    ('Complete', data_complete),\n",
    "    ('Train', data_train),\n",
    "    ('Test', data_test),\n",
    "):\n",
    "    print(label)\n",
    "    print(archive.shape)\n",
    "    print(archive[0].shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work on a copy so df_info is left untouched and this cell can be re-run.\n",
    "df_final = df_info.copy()\n",
    "seasonalities = {s: dict() for s in df_final['SP'].unique()}\n",
    "\n",
    "# One metadata row per id (first occurrence wins, like a\n",
    "# `df_meta[df_meta['id'] == fred_id].iloc[0]` lookup), indexed by id so that\n",
    "# each lookup no longer scans the whole of df_meta.\n",
    "meta_by_id = df_meta.drop_duplicates(subset='id').set_index('id', drop=False)\n",
    "missing_meta = []\n",
    "\n",
    "# Rows of df_final and entries of data_complete are paired by position.\n",
    "for (index, info), series_complete in tqdm(\n",
    "        zip(df_final.iterrows(), data_complete),\n",
    "        total=len(df_final)\n",
    "    ):\n",
    "    fred_id = info['FREDid']\n",
    "\n",
    "    # copy the metadata columns onto this row when the id is known\n",
    "    if fred_id in meta_by_id.index:\n",
    "        df_final.loc[index, meta_by_id.columns] = meta_by_id.loc[fred_id]\n",
    "    else:\n",
    "        missing_meta.append(fred_id)\n",
    "\n",
    "    # the 'SP' value of the current row selects the output dictionary\n",
    "    seasonality = info['SP']\n",
    "    series_complete_clean = series_complete[~np.isnan(series_complete)] # remove nans\n",
    "    seasonalities[seasonality][fred_id] = series_complete_clean\n",
    "\n",
    "if missing_meta:\n",
    "    print(f'{len(missing_meta)} series have no row in df_meta; their metadata columns are left empty')\n",
    "\n",
    "# 'id' is the metadata key column copied over above; errors='ignore' keeps\n",
    "# the cell working when no metadata row matched at all.\n",
    "df_final = df_final.drop(columns=['id'], errors='ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_final.to_csv(OUTPUT_PATH + 'FRED_meta_test.csv', index=False)\n",
    "\n",
    "# np.save (not np.savez) writes a genuine .npy file: the dict is wrapped in\n",
    "# a zero-dimensional object array that can be read back with\n",
    "# np.load(..., allow_pickle=True)[()]. np.savez would append '.npz' to the\n",
    "# file name and store the dict under the key 'arr_0' instead.\n",
    "for seasonality, series in seasonalities.items():\n",
    "    print(f'{seasonality}: {len(series)}')\n",
    "    np.save(OUTPUT_PATH + f'FRED_{seasonality}_test.npy', series)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load preprocessed files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the metadata table back from the preprocessed folder.\n",
    "# NOTE(review): the export cell writes 'FRED_meta_test.csv' while this cell\n",
    "# reads 'FRED_meta.csv' - confirm which file is intended.\n",
    "df_meta = pd.read_csv(f'{OUTPUT_PATH}FRED_meta.csv', low_memory=False)\n",
    "df_meta.iloc[:1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The loaded object is a zero-dimensional array whose only element is the\n",
    "# dictionary; indexing with an empty tuple unwraps it.\n",
    "yearly_archive = np.load(f'{OUTPUT_PATH}FRED_Yearly.npy', allow_pickle=True)\n",
    "data = yearly_archive[()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spot check on the first dictionary entry only: print its key, shape and\n",
    "# 'SP' value, then plot the values.\n",
    "for series_id, series_values in data.items():\n",
    "    series_meta = df_meta.loc[df_meta['FREDid'] == series_id]\n",
    "    print(series_id, series_values.shape)\n",
    "    print(series_meta['SP'].values[0])\n",
    "    plt.plot(series_values)\n",
    "    plt.show()\n",
    "    break  # one example is enough"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "moment",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
